MODEL COMPUTATION (reference and validation data preprocessing)
Library loading, function definition and gene annotation
library(limma)
library(DESeq2)
library(biomaRt)
library(org.Hs.eg.db)
library(readxl)
library(reshape2)
library(ggplot2)
library(ggfortify)
library(ggpubr)
library(caret)
library(steFunctions)
library(EnsDb.Hsapiens.v79)
library(EnsDb.Hsapiens.v75)
library(heatmap3)
library(tidyverse)
library(ComplexHeatmap)
library(circlize)
library(doMC)
library(pROC)
# Map a numeric vector onto a colour gradient (one colour per value).
#
# colValue : named numeric vector; names are propagated to the result.
# minRange / maxRange : bounds of the gradient; default to the data range
#                       padded by 0.001 on each side.
# ccolors  : colours interpolated across the range.
# bbreak   : step size between colour breaks.
# Returns a character vector of colours named after `colValue`.
getGradientColors <- function(colValue, minRange = NULL, maxRange = NULL,
                              ccolors = c("green3", "red"), bbreak = 0.01) {
  # colByValue() lives in heatmap3; reference the namespace explicitly instead
  # of attaching the package inside the function body.
  if (!requireNamespace("heatmap3", quietly = TRUE)) {
    stop("Package 'heatmap3' is required by getGradientColors()", call. = FALSE)
  }
  if (is.null(minRange)) {
    minRange <- min(colValue, na.rm = TRUE) - 0.001
  }
  if (is.null(maxRange)) {
    maxRange <- max(colValue, na.rm = TRUE) + 0.001
  }
  bre <- seq(minRange, maxRange, bbreak)
  # colByValue() opens a colour-key plot as a side effect; guarantee the
  # device is closed even if a later step fails.
  on.exit(graphics.off(), add = TRUE)
  colori <- heatmap3::colByValue(colValue,
                                 col = colorRampPalette(ccolors)(length(bre) - 1),
                                 breaks = bre, cex.axis = 0.8)
  colori <- as.vector(colori)
  names(colori) <- names(colValue)
  colori
}
# Project paths (idiomatic `<-` assignment instead of `=`).
MEDIAsave <- "/home/mclaudia/MEDIA Project/Codici_finali/"
wdd <- "/home/mclaudia/MEDIA Project/OAStratification-pipeline/"
MEDIAgraph <- "/home/mclaudia/MEDIA Project/Codici_finali/Paper_figures/"
# DE results of the unpaired bulk comparison (used for the TPM duplicate fix).
load(paste0(MEDIAsave, "genide-bulk-unpaired_DE.RData"), verbose = TRUE) #for TPM duplicate fix
## Loading objects:
## OAvsnonOA
# Consensus gene lists: sheet 1 = up-regulated, sheet 2 = down-regulated.
up_genes <- read_excel(paste0(MEDIAsave, "up_dw_genes_consensus.xlsx"), sheet = 1)
dw_genes <- read_excel(paste0(MEDIAsave, "up_dw_genes_consensus.xlsx"), sheet = 2)
# Ensembl-id -> gene-symbol maps from EnsDb annotations (Ensembl 79 and 75).
ensdf <- GenomicFeatures::genes(EnsDb.Hsapiens.v79, return.type = "DataFrame")
ens2gene <- as.data.frame(ensdf[, c("gene_id", "gene_name")])
ensdf19 <- GenomicFeatures::genes(EnsDb.Hsapiens.v75, return.type = "DataFrame")
ens2gene19 <- as.data.frame(ensdf19[, c("gene_id", "gene_name")])
We use Dataset 3 (unpaired) as the reference. The TPM matrix was collected
from the txi-import file of Soul et al., 2017.
# Build the reference TPM matrix (Dataset 3, unpaired; txi import from Soul et al. 2017).
load(paste0(wdd,"data/txi.RData"))
# Sample annotation: drop column 3, remove duplicated sample names.
patientDetails<-read.csv(paste0(wdd,"data/patientDetails_all_withMed.csv"),stringsAsFactors = F)
patientDetails <-patientDetails[,-3]
patientDetails<-patientDetails[!duplicated(patientDetails$name),]
# OA status is derived from the sample-name prefix ("SH0" marks OA samples).
patientDetails$OA<-ifelse(grepl("SH0", patientDetails$name),"OA","NonOA")
tmp <- txi$abundance[,match(unique(as.character(patientDetails$name)),colnames(txi$abundance))] #collect TPM matrix
# NOTE(review): 35 is hard-coded as 50% of the 70 samples; recompute as
# ncol(tmp)/2 if the cohort ever changes.
keep <- rowSums(tmp > 0) >= 35 #TPMs > 0 in at least 50% of the samples
tmp <-tmp[keep,]
# Attach gene symbols: merge() appends gene_name as the last (72nd) column;
# it is moved to the front and the duplicate column dropped below.
# NOTE(review): merge() is called with by.y only; by.x falls back to the
# default column intersection -- specify both explicitly to be safe.
tmp <-cbind(gene_id=rownames(tmp), tmp)
tmp<-merge(tmp,ens2gene,by.y="gene_id")
tmp <-cbind(tmp$gene_name, tmp[,-72])
colnames(tmp)[1] <-"gene_name"
rownames(tmp) <-tmp$gene_id
# Symbols mapped by more than one Ensembl id are collapsed by averaging their
# TPMs (rounded to 3 decimals); uniquely-mapped genes are kept as-is.
dupp <-tmp[duplicated(tmp$gene_name),]
gn_dup <-unique(dupp[,1])
tpm.mat_maintain <-tmp[!tmp$gene_name %in% gn_dup,]
tpm2 <-as.data.frame(matrix(NA, length(gn_dup),ncol(tpm.mat_maintain[,-c(1,2)])))
colnames(tpm2) <-colnames(tpm.mat_maintain)[-c(1,2)]
rownames(tpm2) <-paste0("ENS_",1:length(gn_dup))
for(i in 1:length(gn_dup)){
x=gn_dup[i]
y=tmp[tmp[,1] %in% x,-c(1,2)]
tpm2[i,1:ncol(tpm2)] <-apply(y,2,function(x) round(mean(as.numeric(x)),3))
}
tpm2 <-cbind(gene_name=gn_dup,tpm2)
tpm2 <-tpm2[,colnames(tpm.mat_maintain)[-2]]
# Recombine unique and collapsed genes; rows are named by gene symbol and the
# values are coerced back to numeric (merge/cbind made them character).
tpm3 <-rbind(tpm.mat_maintain[,-2],as.matrix(tpm2))
rownames(tpm3) <-tpm3[,1]
tpm.mat <-t(apply(tpm3[,-1],1,as.numeric))
colnames(tpm.mat) <-colnames(tpm3)[-1]
dim(tpm.mat)
## [1] 17804 70
# log2-TPM with a small pseudo-count to avoid log2(0).
train_tpm <-log2(tpm.mat+0.001)
boxplot(train_tpm)

Load Validation dataset from GSE114007 (see vignette 4)
# Validation cohort (GSE114007), prepared in vignette 4.
# NOTE(review): the load reports object `newdataset`, but the code below uses
# `validation_tpm` -- presumably the RData also provides it (or it is derived
# in a step not shown here); verify against vignette 4.
load(paste0(MEDIAsave,"datasetValidation_full.RData"),verbose = TRUE)
## Loading objects:
## newdataset
Rescale the validation TPM-log2 data based on the reference:
1. Intersect the reference genes with those of the sample of interest
(or validation dataset).
2. Quantile-normalize the reference dataset.
# Restrict both datasets to their shared genes, then quantile-normalize the
# reference so all its columns share one distribution.
ii=intersect(rownames(train_tpm),rownames(validation_tpm)) #select common genes,14565
train_tpm <-train_tpm[ii,]
validation_tpm <-validation_tpm[ii,]
dim(train_tpm)
## [1] 14565 70
dim(validation_tpm)
## [1] 14565 38
# NOTE(review): quantileNormalization() is not base R -- presumably from the
# steFunctions package loaded above, and the double transpose suggests it
# expects samples in rows; confirm.
train_tpm <-t(quantileNormalization(t(train_tpm)))
# Sorted values of column 1 act as the reference distribution for rescaling
# (after quantile normalisation every column should share the same sorted values).
list_ranks <- as.numeric(sort(train_tpm[,1],decreasing=TRUE))
head(list_ranks)
## [1] 15.90471 14.80488 14.42682 14.01457 13.75508 13.53887
3. Rescale the sample of interest (or validation dataset) onto the
reference quantile intervals.
# Quantile-rescale the validation samples onto the reference distribution:
# within each validation sample, the i-th largest expression value is replaced
# by the i-th largest reference value (`list_ranks`), so every sample inherits
# the reference quantiles while keeping its own gene ranking.
# (fix) the original preallocated `list_val` and immediately overwrote it, and
# round-tripped through as.list()/unlist() and data.frame -- all removed.
# Per-sample expression values sorted decreasingly (names = gene ids).
list_val <- lapply(seq_len(ncol(validation_tpm)),
                   function(j) sort(validation_tpm[, j], decreasing = TRUE))
names(list_val) <- colnames(validation_tpm)
# For each sample, pair the rank-ordered reference values with that sample's
# rank-ordered gene names.
list_valRank <- vector("list", ncol(validation_tpm))
names(list_valRank) <- colnames(validation_tpm)
for (i in seq_along(list_val)) {
  ranked <- list_ranks                   # reference values, largest first
  names(ranked) <- names(list_val[[i]])  # genes of sample i, largest first
  list_valRank[[i]] <- ranked
}
# Assemble the rescaled matrix in the original gene order.
normdata2 <- matrix(NA, nrow = nrow(validation_tpm), ncol = ncol(validation_tpm))
rownames(normdata2) <- rownames(validation_tpm)
colnames(normdata2) <- colnames(validation_tpm)
for (i in seq_along(list_valRank)) {
  normdata2[, i] <- list_valRank[[i]][rownames(normdata2)]
}
boxplot(normdata2) #Validation re-scaled on quantile normalized reference

ELASTIC NET model (select most informative features)
- Run the model 100 times (100 samples of 20 elements each: 10 random OA
and 10 healthy samples)
- Then select the features that occurred in at least 50% of the runs
- Compute the mean coefficient value of each selected feature
- Compute a score per patient (a linear combination of the selected
features' TPM gene-expression values)
# Keep the full-gene matrices aside for the Ridge model (which uses all 43
# consensus features), then restrict the working matrices to consensus genes.
train_tpm_forRidge <- train_tpm
validation_tpm_forRidge <- normdata2
consensus_genes <- c(up_genes$gene, dw_genes$gene)
keep_rows <- rownames(normdata2) %in% consensus_genes
validation_tpm <- normdata2[keep_rows, ]
dim(validation_tpm)
## [1] 43 38
train_tpm <- train_tpm[rownames(train_tpm) %in% rownames(validation_tpm), ]
dim(train_tpm)
## [1] 43 70
Set 100 OA sample runs: balance data for normal samples
(bootstrap)
# Build 100 balanced bootstrap runs: each uses all 10 non-OA samples
# (columns 1-10) plus 10 OA samples drawn without replacement from
# columns 11-70.
set.seed(7894)
runs <- vector("list", 100)
names(runs) <- paste0("run_", 1:100)
# (fix) the lambda previously did `x <- sample(...)`; the assignment was
# redundant -- the sampled vector is simply returned.
runs <- lapply(runs, function(x) sample(11:70, 10, replace = FALSE))
subsets <- vector("list", 100)
names(subsets) <- paste0("run_", 1:100)
# Column subsets of the training matrix, one per run.
for (i in seq_along(subsets)) subsets[[i]] <- train_tpm[, c(1:10, as.numeric(runs[[i]]))] #create subsets
Train
Tune the model
# Leave-one-out CV for caret::train; class probabilities are required later
# for the ROC analysis.
# NOTE(review): `number` is ignored when method = "LOOCV".
trainCtrl.rpcv <- trainControl(method = "LOOCV", number = 1,
verboseIter = TRUE, returnData = FALSE,
savePredictions = TRUE,
classProbs = TRUE)
# Labels of a balanced 20-sample run: 10 normal ("N") followed by 10 OA.
ClassBoot=as.factor(c(rep("N",10),rep("OA",10)))
# One coefficient set per run will be stored here.
list_tunes <-vector("list",100)
names(list_tunes) <-paste0("tunes_",1:100)
Train the model for 100 runs
# Fit one elastic-net model per bootstrap run (alpha fixed at 0.5, lambda
# tuned over 0.001..1 by LOOCV) and store the coefficients at the optimal
# lambda of each run.
nCores <- 150 #change number of threads if needed
registerDoMC(nCores)
for(i in 1:length(subsets)){
# caret expects samples in rows, features in columns.
train_dataset <-t(subsets[[i]])
train <- train(train_dataset,
y = ClassBoot,
method = "glmnet",
trControl = trainCtrl.rpcv,
metric = "Accuracy", allowParallel=TRUE,
tuneGrid = expand.grid(alpha = 0.5, #for elastic net, median between 0 and 1
lambda = seq(0.001,1,by = 0.001)))
# Sparse coefficient vector (intercept included) at the best lambda.
list_tunes[[i]] <- coef(train$finalModel, train$finalModel$lambdaOpt)
}
Assess presence of each feature for each run
# Coefficient table: one row per run, one column per candidate gene; a run
# contributes its fitted glmnet coefficients (0 = gene not selected).
table_occurrence <- matrix(0, 100, length(rownames(train_tpm)))
colnames(table_occurrence) <- rownames(train_tpm)
rownames(table_occurrence) <- paste0("run_", 1:100)
# (fix) seq_len() instead of 1:nrow(); loop variable renamed so it no longer
# clobbers the global `tmp` used earlier in the script.
for (i in seq_len(nrow(table_occurrence))) {
  run_cfs <- list_tunes[[i]][, 1][-1]          # drop the intercept
  table_occurrence[i, names(run_cfs)] <- run_cfs
}
meanvalues <- table_occurrence                  # keep raw coefficients for averaging
table_occurrence[table_occurrence != 0] <- 1    # binarise: 1 = selected in that run
# Tile plot of gene selection across the 100 runs.
tbO <- melt(table_occurrence)
ggplot(tbO, aes(y=Var2, x=Var1, fill=factor(value,levels = c("1","0"))))+
geom_tile()+
theme(axis.text.y = element_text(size=13))+
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1,size=7))+
xlab("Runs")+
ylab("Genes")+
scale_fill_manual(name="value",values=c("1"="red", "0"="blue"))

# Genes selected in at least 50 of the 100 runs (outer parentheses print it).
(selected <-sort(colSums(table_occurrence)[colSums(table_occurrence) >= 50],decreasing = TRUE))
## TNFSF11 LOXL3 DNER TSPAN2 THBS3 ASPN HTRA1 DYSF
## 100 98 98 94 94 85 68 62
Select the features with at least 50% occurrence and compute their mean
coefficient values across the runs
# Keep the genes selected in at least half of the runs and average each
# gene's coefficient over all 100 runs (zeros included).
selected1 <- names(selected)
meanvalues <- meanvalues[, selected1, drop = FALSE]
(cf <- colMeans(meanvalues))
## TNFSF11 LOXL3 DNER TSPAN2 THBS3 ASPN HTRA1
## 0.14553148 0.08586524 0.17203360 0.04224252 0.10231979 0.03194829 0.02917054
## DYSF
## 0.03668216
# Bar plot of the averaged coefficients, largest first.
cf_sorted <- sort(cf, decreasing = TRUE)
barplot(cf_sorted, col = "dodgerblue", cex.names = 1.2, font = 2, yaxt = "n")
axis(side = 2)

Compute score (sR) for Reference dataset , exploring also the
distribution of this score across the patients
# Score sR on the reference cohort: linear combination of the selected genes'
# expression with their mean elastic-net coefficients.
# Reference labels: samples 1-10 are normal, 11-70 OA.
ClassTRfull <- as.factor(c(rep("N", 10), rep("OA", 60)))
score_df <- data.frame(patient = colnames(train_tpm), score = NA, class = ClassTRfull)
train_sc <- t(train_tpm[names(cf), ])   # patients x selected genes
# (fix) the per-patient dot product is a matrix product; replaces the
# previous rowSums(t(apply(...))) construction (same result, clearer/faster).
sc <- drop(train_sc %*% cf)             # named numeric vector, one score per patient
rownames(score_df) <- score_df[, 1]
score_df <- score_df[names(sc), ]       # align rows with the score order
score_df$score <- sc
score_df_train <- score_df
rm(score_df)
colnames(score_df_train) <- c("Patient", "Score", "Class")
# Density of score R by class on the reference cohort.
ggplot(score_df_train, aes(x=Score,fill=Class)) +
geom_density(alpha=.7)+
xlab("score R")+
scale_fill_manual(values = c("plum","cyan"))+
theme_bw()

# Boxplot of score R by class; stat_compare_means() (ggpubr) annotates a
# between-group test (its default test -- check which one the figure reports).
ggplot(score_df_train, aes(y=Score,x=Class, fill=Class))+
geom_boxplot(alpha=.6)+
stat_compare_means(cex=7,label.x=0.75,label.y = 3)+
geom_jitter((aes(colour = Class))) +
ylab("score R")+
scale_fill_manual(values = c("plum","cyan"))+
scale_color_manual(values = c("deeppink","turquoise"))+
theme_bw()+
theme(axis.text.y = element_text(size = 15),
axis.text.x = element_text(size = 15),
legend.title=element_text(size=16),
legend.text=element_text(size=14))+
guides(color = guide_legend(override.aes = list(size = 2)))

How does the sR discriminate between healthy and affected patients?
REFERENCE
# Per-patient colour annotations for the heatmap: col1 encodes the score as a
# blue gradient, col2 the class (plum = N, cyan = OA; after sorting by name
# the first 10 rows are the normal samples).
col1 <-getGradientColors(score_df_train$Score, ccolors = c("deepskyblue1","dodgerblue","dodgerblue3","navy"), maxRange = max(score_df_train$Score)+ 0.01, minRange = min(score_df_train$Score)-0.01)
score_df_train$col1 <-col1
score_df_train <-score_df_train[order(score_df_train$Patient),]
score_df_train$col2 <-c(rep("plum",10),rep("cyan",60))
sum(up_genes$gene %in% rownames(train_tpm))
## [1] 38
sum(dw_genes$gene %in% rownames(train_tpm))
## [1] 5
# Consensus genes present in the reference matrix: 38 up + 5 down = 43.
sel <-c(up_genes$gene,dw_genes$gene)[c(up_genes$gene,dw_genes$gene) %in% rownames(train_tpm)] #43
train_tpm <-train_tpm[sel,]
# NOTE(review): the 38/5 counts are hard-coded from the sums printed above;
# recompute them if the consensus lists change.
annGenes <-data.frame(genes=sel, col3=c(rep("red",38),rep("blue",5)))
# Colour scale for the score legend, anchored at min / 1st quartile /
# 3rd quartile / max of the score distribution.
summ=summary(score_df_train$Score)
col_fun = colorRamp2(c(summ[1],summ[2],summ[5],summ[6]), c("#00BFFF","#197CDD","#0F4DB3","#000080"))
score_df_train <-score_df_train[order(score_df_train$Score,decreasing = FALSE),] #order the patients for increasing Score
head(score_df_train)
## Patient Score Class col1 col2
## MRI009 MRI009 1.340591 N #00BFFF plum
## MRI008 MRI008 1.443142 N #03B9FF plum
## MRI002 MRI002 1.489666 N #04B7FF plum
## MRI005 MRI005 1.532074 N #06B5FF plum
## MRI004 MRI004 1.672486 N #0AAEFF plum
## MRI006 MRI006 1.769805 N #0DAAFF plum
Heatmap: z-score computation of the data with patients sorted by
increasing score
# Row-wise z-scores (per gene across patients), clamped to [-3, 3] so extreme
# values do not dominate the colour scale.
train_tpm_scal <-t(apply(train_tpm,1,scale))
rownames(train_tpm_scal) <-rownames(train_tpm)
colnames(train_tpm_scal) <-colnames(train_tpm)
train_tpm_scal[train_tpm_scal < -3] <- -3
train_tpm_scal[train_tpm_scal > 3] <- 3
# Heatmap with columns fixed in increasing-score order (Colv = NA disables
# column clustering); side colours show score, class and consensus direction.
heatmap3(train_tpm_scal[annGenes$genes,rownames(score_df_train)],
Colv=NA,
scale="none",
col=colorRampPalette(c("royalblue", "white", "red1"))(1024),
RowAxisColors = 1,
balanceColor = TRUE,
margins = c(15,15),
ColSideColors = as.matrix(score_df_train[,c(4,5)]),
ColSideLabs = c("score R","Status"),
RowSideColors = annGenes$col3,
RowSideLabs = "Consensus",
cexRow = 1, cexCol = 1)
# Manual legends at fixed device coordinates; the score legend is drawn with
# ComplexHeatmap in inches -- adjust positions for other device sizes.
legend(0.4,1,legend=c("OA","Normal"),cex=2,
fill=c("cyan","plum"), bty = "n", title = "Class")
legend(0.5,1,legend=c("UP","DOWN"),cex=2,
fill=c("red","blue"), bty = "n", title = "DE")
draw(Legend(col_fun = col_fun, title = "score R"), x = unit(19, "in"), y = unit(13, "in"))

Compute score (sR) for Validation dataset, exploring also the
distribution of this score across the patients
# Score sR on the validation cohort (GSE114007): 18 normal then 20 OA samples.
ClassV <- factor(c(rep("N", 18), rep("OA", 20)))
score_df <- data.frame(patient = colnames(validation_tpm), score = NA, class = ClassV)
val_sc <- t(validation_tpm[names(cf), ])   # patients x selected genes
identical(names(cf), colnames(val_sc)) #TRUE
## [1] TRUE
# (fix) per-patient dot product as a matrix product; replaces the previous
# rowSums(t(apply(...))) construction (same result, clearer/faster).
sc <- drop(val_sc %*% cf)                  # named numeric vector, one score per patient
rownames(score_df) <- score_df[, 1]
score_df <- score_df[names(sc), ]          # align rows with the score order
score_df$score <- sc
head(sc)
## Normal_Cart_10_8 Normal_Cart_2_2 Normal_Cart_3_3 Normal_Cart_4_4
## 2.665540 3.051013 2.073371 3.084911
## Normal_Cart_5_5 Normal_Cart_6_6
## 1.887718 2.122148
score_df_val <- score_df
rm(score_df)
colnames(score_df_val) <- c("Patient", "Score", "Class")
# Density of score R by class on the validation cohort.
ggplot(score_df_val, aes(x=Score,fill=Class)) +
geom_density(alpha=.7)+
xlab("score R")+
scale_fill_manual(values = c("plum","cyan"))+
theme_bw()

# Boxplot of score R by class; stat_compare_means() (ggpubr) annotates a
# between-group test (its default test -- check which one the figure reports).
ggplot(score_df_val, aes(y=Score,x=Class, fill=Class))+
geom_boxplot(alpha=.6)+
stat_compare_means(cex=7,label.x=0.75,label.y = 4)+
geom_jitter((aes(colour = Class))) +
ylab("score R")+
scale_fill_manual(values = c("plum","cyan"))+
scale_color_manual(values = c("deeppink","turquoise"))+
theme_bw()+
theme(axis.text.y = element_text(size = 15),
axis.text.x = element_text(size = 15),
legend.title=element_text(size=16),
legend.text=element_text(size=14))+
guides(color = guide_legend(override.aes = list(size = 2)))

How does the sR discriminate between healthy and affected patients?
VALIDATION
# Per-patient colour annotations for the validation heatmap: col1 = score
# gradient, col2 = class (plum = N, cyan = OA).
# NOTE(review): the 18/20 split relies on normal_* names sorting before the
# OA names after order(Patient); verify if sample naming changes.
col1 <-getGradientColors(score_df_val$Score, ccolors = c("deepskyblue1","dodgerblue","dodgerblue3","navy"), maxRange = max(score_df_val$Score)+ 0.01, minRange = min(score_df_val$Score)-0.01)
score_df_val$col1 <-col1
score_df_val <-score_df_val[order(score_df_val$Patient),]
score_df_val$col2 <-c(rep("plum",18),rep("cyan",20))
# Reorder validation rows to the consensus gene order used for the reference.
validation_tpm <-validation_tpm[sel,]
# Colour scale for the score legend, anchored at min / 1st quartile /
# 3rd quartile / max of the validation scores.
summ2=summary(score_df_val$Score)
col_fun2 = colorRamp2(c(summ2[1],summ2[2],summ2[5],summ2[6]), c("#00BFFF","#1C88F2","#0D40AB","#000080"))
score_df_val <-score_df_val[order(score_df_val$Score,decreasing = FALSE),] #order the patients for increasing Score
head(score_df_val)
## Patient Score Class col1 col2
## normal_06 normal_06 0.6873741 N #00BFFF plum
## normal_02 normal_02 1.1900648 N #0BACFF plum
## normal_04 normal_04 1.2291431 N #0CABFF plum
## normal_10 normal_10 1.4077748 N #10A5FF plum
## Normal_Cart_5_5 Normal_Cart_5_5 1.8877178 N #1B93FF plum
## normal_07 normal_07 1.9662685 N #1D91FF plum
Heatmap: z-score computation of the data with patients sorted by
increasing score
# Row-wise z-scores (per gene across patients), clamped to [-3, 3] so extreme
# values do not dominate the colour scale.
validation_tpm_scal <-t(apply(validation_tpm,1,scale))
rownames(validation_tpm_scal) <-rownames(validation_tpm)
colnames(validation_tpm_scal) <-colnames(validation_tpm)
validation_tpm_scal[validation_tpm_scal < -3] <- -3
validation_tpm_scal[validation_tpm_scal > 3] <- 3
# Heatmap with columns fixed in increasing-score order (Colv = NA disables
# column clustering); side colours show score, class and consensus direction.
heatmap3(validation_tpm_scal[annGenes$genes,rownames(score_df_val)],
Colv=NA,
scale="none",
col=colorRampPalette(c("royalblue", "white", "red1"))(1024),
RowAxisColors = 1,
balanceColor = TRUE,
margins = c(10,10),
ColSideColors = as.matrix(score_df_val[,c(4,5)]),
ColSideLabs = c("score R","Status"),
RowSideColors = annGenes$col3,
RowSideLabs = "Consensus",
cexRow = 1, cexCol = 1)
# Manual legends at fixed device coordinates; the score legend is drawn with
# ComplexHeatmap in inches -- adjust positions for other device sizes.
legend(0.4,1,legend=c("OA","Normal"),cex=2,
fill=c("cyan","plum"), bty = "n", title = "Class")
legend(0.5,1,legend=c("UP","DOWN"),cex=2,
fill=c("red","blue"), bty = "n", title = "DE")
draw(Legend(col_fun = col_fun2, title = "score R"), x = unit(19, "in"), y = unit(13, "in"))

RIDGE model (all the available features,43)
Train
Tune the model
# RIDGE model: use all 43 consensus genes as features (no selection step).
validation_tpm <-validation_tpm_forRidge[rownames(validation_tpm_forRidge) %in% c(up_genes$gene,dw_genes$gene),]
train_tpm <-train_tpm_forRidge[rownames(train_tpm_forRidge) %in% rownames(validation_tpm),]
# LOOCV control as for the elastic-net runs.
# NOTE(review): `number` is ignored when method = "LOOCV".
trainCtrl.rpcv <- trainControl(method = "LOOCV", number = 1,
verboseIter = TRUE, returnData = FALSE,
savePredictions = TRUE,
classProbs = TRUE)
Train the model and save the coefficients
# Single ridge fit (alpha = 0) on the full 70-sample reference cohort.
nCores <- 100 #change number if needed
registerDoMC(nCores)
# caret expects samples in rows, features in columns.
train_dataset <-t(train_tpm)
train <- train(train_dataset,
y = ClassTRfull,
method = "glmnet",
trControl = trainCtrl.rpcv,
metric = "Accuracy", allowParallel=TRUE,
tuneGrid = expand.grid(alpha = 0, #ridge
lambda = seq(0.001,1,by = 0.001)))
# Ridge coefficients at the optimal lambda, intercept dropped.
cf_all <-coef(train$finalModel, train$finalModel$lambdaOpt)[,1][-1]
Compute score (sT) for Reference dataset using coefficients from the
Ridge model
# Score sT on the reference cohort: linear combination of all 43 genes with
# the ridge coefficients.
score_df_all_tr <-data.frame(patient=colnames(train_tpm), score=NA, class=ClassTRfull)
train_sc_all <-t(train_tpm[names(cf_all),])
# rowSums(t(apply(...))) is the per-patient dot product with cf_all
# (equivalent to train_sc_all %*% cf_all).
sc1 <-rowSums(t(apply(train_sc_all,1,function(x) x*cf_all)))
rownames(score_df_all_tr) <-score_df_all_tr[,1]
score_df_all_tr <-score_df_all_tr[names(sc1),]
score_df_all_tr$score <-sc1
head(sc1)
## MRI001 MRI002 MRI003 MRI004 MRI005 MRI006
## 7.028785 2.452862 4.865943 3.555113 4.605602 3.928878
Compute score (sT) for Validation dataset using coefficients from
the Ridge model
# Score sT on the validation cohort with the same ridge coefficients.
score_df_all_val <-data.frame(patient=colnames(validation_tpm), score=NA, class=ClassV)
val_sc_all <-t(validation_tpm[names(cf_all),])
# Per-patient dot product with cf_all (equivalent to val_sc_all %*% cf_all).
sc <-rowSums(t(apply(val_sc_all,1,function(x) x*cf_all)))
rownames(score_df_all_val) <-score_df_all_val[,1]
score_df_all_val <-score_df_all_val[names(sc),]
score_df_all_val$score <-sc
head(sc)
## Normal_Cart_10_8 Normal_Cart_2_2 Normal_Cart_3_3 Normal_Cart_4_4
## 5.669915 7.437420 3.869674 6.863030
## Normal_Cart_5_5 Normal_Cart_6_6
## 5.256593 6.175890
Compare the ROC curves on validation dataset to assess the efficacy
of the sR prediction and apply DeLong’s test
# Compare the ROC curves of sR (8 features) and sT (43 features) on the
# validation cohort, and test the AUC difference with DeLong's test.
# NOTE(review): `rocobj_val` and `rocobj_val_all` are not created in the code
# shown here -- presumably pROC::roc objects built from score_df_val and
# score_df_all_val in an undisplayed step; verify upstream.
list_rocs <-list(rocobj_val,rocobj_val_all)
names(list_rocs) <-c("Score R","Score T")
auc <- lapply(list_rocs, function(x) round(x$auc,3))
auc[1]
## $`Score R`
## [1] 0.875
auc[2]
## $`Score T`
## [1] 0.922
# DeLong's test: p = 0.507, so the AUC difference between the two scores is
# not significant on this cohort.
(rocts <-roc.test(rocobj_val,rocobj_val_all))
##
## DeLong's test for two ROC curves
##
## data: rocobj_val and rocobj_val_all
## D = -0.66741, df = 69.259, p-value = 0.5067
## alternative hypothesis: true difference in AUC is not equal to 0
## sample estimates:
## AUC of roc1 AUC of roc2
## 0.8750000 0.9222222
# Overlaid ROC curves; the AUC annotations are hard-coded -- update them if
# the values above change.
ggroc(list_rocs, size = 2,legacy.axes = TRUE) +
theme_bw()+
annotate("text", 0.75, 0.44, label=expression("AUC 8 features - score s"["R"] : 0.875) ,size=7)+
annotate("text", 0.75,0.4, label=expression("AUC 43 features - score s"["T"] : 0.922) ,size=7)+
labs(x = "1 - Specificity",y = "Sensitivity",color='Models')+
theme(legend.title=element_text(size=16),legend.text=element_text(size=14))+
guides(Models = guide_legend(override.aes = list(size = 4)))

Save resulting scores for the two models and the two datasets
# Persist all four score tables (ridge sT and elastic-net sR, on both cohorts).
list_scores <- list(
  "scoreT Training" = score_df_all_tr,
  "scoreT Valid"    = score_df_all_val,
  "scoreR Training" = score_df_train,
  "scoreR Valid"    = score_df_val
)
save(list_scores, file = paste0(MEDIAsave, "list_scores.RData"))